{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d5aaf762",
   "metadata": {},
   "outputs": [],
   "source": [
    "# from huggingface_hub import snapshot_download\n",
    "\n",
    "# snapshot_download(\n",
    "#     repo_type='dataset',\n",
    "#     repo_id=\"mesolitica/Sampling-Multitask-National-Speech-Corpus-v1\", \n",
    "#     allow_patterns='*.zip', local_dir = './')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "80fb61bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "# Fetch the single train parquet shard; `f` holds the value returned by `hf_hub_download`.\n",
    "REPO_ID = \"mesolitica/Sampling-Multitask-National-Speech-Corpus-v1\"\n",
    "f = hf_hub_download(\n",
    "    repo_id=REPO_ID,\n",
    "    repo_type='dataset',\n",
    "    filename=\"data/train-00000-of-00001.parquet\",\n",
    "    local_dir=\"./Sampling-Multitask-National-Speech-Corpus-v1\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9cabb7b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "import zipfile\n",
    "import os\n",
    "\n",
    "def chunks(l, n):\n",
    "    for i in range(0, len(l), n):\n",
    "        yield (l[i: i + n], i // n)\n",
    "\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    df_split = chunks(strings, len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    pooled = pool.map(function, df_split)\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain(*pooled))\n",
    "\n",
    "def loop(files):\n",
    "    \"\"\"\n",
    "    Extract every zip archive of one chunk into the current directory, then delete the archive.\n",
    "\n",
    "    `files` is a `(chunk, chunk_index)` tuple as yielded by `chunks`; the index is ignored.\n",
    "    \"\"\"\n",
    "    archive_paths, _ = files\n",
    "    destination_folder = './'\n",
    "    for archive_path in tqdm(archive_paths):\n",
    "        with zipfile.ZipFile(archive_path, 'r') as archive:\n",
    "            archive.extractall(destination_folder)\n",
    "        # Reached only after a successful extraction, so a failing archive stays on disk.\n",
    "        os.remove(archive_path)\n",
    "\n",
    "files = glob('*.zip')\n",
    "# Skip the pool entirely when there is no archive to extract.\n",
    "if files:\n",
    "    n_workers = min(len(files), 20)\n",
    "    multiprocessing(files, loop, cores=n_workers, returned=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "31b02952",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b95e27f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_ID = \"Qwen/Qwen2-Audio-7B-Instruct\"\n",
    "processor = AutoProcessor.from_pretrained(MODEL_ID)\n",
    "tokenizer = processor.tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5c58eb70",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': 'What was the reason behind Speaker1 crying during the parent-teacher meeting?',\n",
       " 'answer': 'Speaker1 cried because their dad scolded them for not reading up, as instructed by their teachers.',\n",
       " 'audio_filename': 'sampling-audio/SQA-PART3-Train-audio_train-00153-of-00171-0.mp3',\n",
       " 'start': None,\n",
       " 'end': None,\n",
       " 'context': None,\n",
       " 'system': None,\n",
       " 'sliced_audio_filename': None}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# NOTE: despite its name, `df` is a list of row dicts, not a DataFrame.\n",
    "df = (\n",
    "    pd.read_parquet(f)\n",
    "    .to_dict(orient='records')\n",
    ")\n",
    "df[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "6f3fed36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one example chat from the first row to check how the template renders it.\n",
    "sample_row = df[0]\n",
    "conversation = [\n",
    "    {\"role\": \"system\", \"content\": \"You are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.\"},\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"audio\", \"audio_url\": \"audio.wav\"},\n",
    "            {\"type\": \"text\", \"text\": sample_row[\"instruction\"]},\n",
    "        ],\n",
    "    },\n",
    "    {\"role\": \"assistant\", \"content\": sample_row[\"answer\"]},\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "7b0d8dc6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<|im_start|>system\\nYou are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.<|im_end|>\\n<|im_start|>user\\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\\nWhat was the reason behind Speaker1 crying during the parent-teacher meeting?<|im_end|>\\n<|im_start|>assistant\\nSpeaker1 cried because their dad scolded them for not reading up, as instructed by their teachers.<|im_end|>\\n'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rendered_chat = processor.apply_chat_template(conversation, tokenize=False)\n",
    "rendered_chat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "95584495",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def loop(rows):\n",
    "    \"\"\"\n",
    "    Render one chunk of dataset rows into chat-template text records.\n",
    "\n",
    "    Note: this redefines the zip-extraction `loop` defined in an earlier cell.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    rows : tuple\n",
    "        `(chunk, chunk_index)` as yielded by `chunks`; the index is ignored.\n",
    "        Each row is read through its `audio_filename`, `instruction` and `answer` keys.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One `{'text': ..., 'audio': ...}` record per row whose audio file exists on disk\n",
    "        and whose chat template rendered without raising.\n",
    "    \"\"\"\n",
    "    import warnings\n",
    "\n",
    "    rows, _ = rows\n",
    "    data = []\n",
    "    failed = 0\n",
    "    last_error = None\n",
    "    for r in tqdm(rows):\n",
    "        audio_path = r['audio_filename']\n",
    "        # Rows whose audio was not extracted locally are skipped on purpose.\n",
    "        if not os.path.exists(audio_path):\n",
    "            continue\n",
    "\n",
    "        try:\n",
    "            conversation = [\n",
    "                {'role': 'system', 'content': 'You are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.'},\n",
    "                {\"role\": \"user\", \"content\": [\n",
    "                    {\"type\": \"audio\", \"audio_url\": \"audio.wav\"},\n",
    "                    {\"type\": \"text\", \"text\": r['instruction']},\n",
    "                ]},\n",
    "                {\"role\": \"assistant\", \"content\": r['answer']},\n",
    "            ]\n",
    "            text = processor.apply_chat_template(conversation, tokenize=False)\n",
    "        except Exception as e:\n",
    "            # Best effort: keep going, but remember the failure so it is reported below.\n",
    "            failed += 1\n",
    "            last_error = e\n",
    "            continue\n",
    "\n",
    "        data.append({\n",
    "            'text': text,\n",
    "            'audio': audio_path,\n",
    "        })\n",
    "\n",
    "    if failed:\n",
    "        # One summary per chunk instead of silently dropping rows.\n",
    "        warnings.warn(f'{failed} row(s) skipped because rendering failed; last error: {last_error!r}')\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e4e5aae7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4281.21it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try `loop` in-process on the last 10 rows before running it through the pool.\n",
    "tail_rows = df[-10:]\n",
    "processed = loop((tail_rows, 0))\n",
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "3d16f495",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"<|im_start|>system\\nYou are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.<|im_end|>\\n<|im_start|>user\\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\\nPlease transcribe.<|im_end|>\\n<|im_start|>assistant\\n<Speaker1>: Previously they also ask Glen and #bi-hun# for a lot of help, Especially Hannah. He will ask #bi-hun# for a lot of help. because #bi-hun# and Hannah. <Speaker2>: But the whole time they were there also [ah] <Speaker1>: Same course, [what] no [lah] they will not [lah] <Speaker2>: Ya [loh] <Speaker1>: I am sure they will did their own stuff. Also like only hand wipe machine, but really a lot of work. I think like Hannah need to do a lot of things ya, Hannah, #Ru-Shin# <Speaker2>: A lot work. <Speaker1>: That's why I say you really will be prepared, because you need to juggle with studies cause you need to go for talks. You don't go get loans. You knew like all this kind of big stuff is all is you do, [eh]<|im_end|>\\n\",\n",
       " 'audio': 'sampling-audio/ASR-PART3-Train-audio_train-00012-of-00171-553.mp3'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_record = processed[0]\n",
    "first_record"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "dd189e8d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5347.86it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5368.13it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5198.79it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 4976.36it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5094.28it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5296.65it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5284.89it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5338.74it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5334.40it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5286.28it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 4898.63it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5345.38it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5126.69it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5371.91it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5384.21it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5390.94it/s]\n",
      " 45%|████████████████████████████████████████████▉                                                        | 2109/4734 [00:00<00:00, 5383.00it/s]\n",
      " 68%|█████████████████████████████████████████████████████████████████████                                | 3238/4734 [00:00<00:00, 5537.85it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5456.95it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5479.00it/s]\n",
      " 56%|████████████████████████████████████████████████████████▋                                            | 2657/4734 [00:00<00:00, 5445.17it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5469.66it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 4735.31it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5460.97it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 4738.86it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5474.69it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5453.88it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5445.16it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5491.28it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 4771.25it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5487.95it/s]\n"
     ]
    }
   ],
   "source": [
    "N_WORKERS = 30\n",
    "processed = multiprocessing(df, loop, cores=N_WORKERS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "58547e05",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = 'prepare-Sampling-Multitask-National-Speech-Corpus-v1.json'\n",
    "with open(output_path, 'w') as handle:\n",
    "    json.dump(processed, handle)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
